
### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
p_load(tidyverse, foreign, readstata13, readr, here)


### File Location (R project only)
here::i_am("Survey Only Match/105_match_tfa_to_state_phone.R")
file_path <- here()


### -------------------------------------------------- 
### ---- Download TFA Data ----
### -------------------------------------------------- 

### Download Data
tfa_dat <- read.dta13(paste0(file_path,"/Data/tfa_data.dta"))

### -------------------------------------------------- 
### ---- Area Code to State ----
### -------------------------------------------------- 

### -------------------------------------------------- 
### Clean Area Codes in TFA dataset for Merging
### -------------------------------------------------- 


# repeating each step for the home, work, and cell phone numbers...

tfa_dat_wac <- tfa_dat %>%
  #remove whitespace
  mutate(homephone_clean = gsub(" ", "", homephone)) %>%
  mutate(workphone_clean = gsub(" ", "", workphone)) %>%
  mutate(cellphone_clean = gsub(" ", "", cellphone)) %>%
  #strip inappropriate characters
  mutate(homephone_clean = textclean::strip(homephone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(workphone_clean = textclean::strip(workphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(cellphone_clean = textclean::strip(cellphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  #grab area code
  mutate(homephone_area_code = str_sub(homephone_clean, start=1, end=3)) %>%
  mutate(workphone_area_code = str_sub(workphone_clean, start=1, end=3)) %>%
  mutate(cellphone_area_code = str_sub(cellphone_clean, start=1, end=3)) %>%
  #grab remainder of phone number
  mutate(homephone_number = str_sub(homephone_clean, start=4, end=10)) %>%
  mutate(workphone_number = str_sub(workphone_clean, start=4, end=10)) %>%
  mutate(cellphone_number = str_sub(cellphone_clean, start=4, end=10)) %>%
  #remove country codes and blank out numbers with other non-standard lengths
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)>10,NA,homephone_area_code)) %>%
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                      str_sub(homephone_clean, start=2, end=4),homephone_area_code)) %>% 
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                      str_sub(homephone_clean, start=3, end=5),homephone_area_code)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)>10,NA,homephone_number)) %>%
  mutate(homephone_number = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                      str_sub(homephone_clean, start=5, end=11),homephone_number)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                      str_sub(homephone_clean, start=6, end=12),homephone_number)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)>10,NA,workphone_area_code)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                      str_sub(workphone_clean, start=2, end=4),workphone_area_code)) %>% 
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                      str_sub(workphone_clean, start=3, end=5),workphone_area_code)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)>10,NA,workphone_number)) %>%
  mutate(workphone_number = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                   str_sub(workphone_clean, start=5, end=11),workphone_number)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                   str_sub(workphone_clean, start=6, end=12),workphone_number)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)>10,NA,cellphone_area_code)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                      str_sub(cellphone_clean, start=2, end=4),cellphone_area_code)) %>% 
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                      str_sub(cellphone_clean, start=3, end=5),cellphone_area_code)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)>10,NA,cellphone_number)) %>%
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                   str_sub(cellphone_clean, start=5, end=11),cellphone_number)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                   str_sub(cellphone_clean, start=6, end=12),cellphone_number))


### Create variable with phone number area code
tfa_dat_wac %<>%
  dplyr::mutate(area_code_home = cellphone_area_code) %>%
  dplyr::mutate(area_code_work = workphone_area_code) %>%
  dplyr::mutate(area_code_cell = homephone_area_code)


### -------------------------------------------------- 
### Read list of area codes in USA (downloaded 3/26/2018)
### Source: https://github.com/ravisorg/Area-Code-Geolocation-Database
### -------------------------------------------------- 

### Path to csv file 
area_codes_path <- paste0(file_path,"/Data/us-area-code-cities.csv") 
### Read csv
area_codes <- read_csv(area_codes_path,
                       col_names = FALSE, na = c("", "NA")) %>%
  dplyr::rename(area_code = X1) %>% # Rename Vars
  dplyr::rename(city_name = X2) %>%
  dplyr::rename(state = X3) %>%
  dplyr::rename(country = X4) %>%
  dplyr::rename(latitude = X5) %>%
  dplyr::rename(longitude = X6) %>%
  dplyr::mutate(matches = 1) # Create variable to track successful matches

### Subset & Prep for matching
area_codes <- area_codes %>% 
  dplyr::distinct(area_code, state, matches) %>% # Unique list of area codes and states
  dplyr::mutate(area_code_home = as.character(area_code)) %>% # Create var for each phone type
  dplyr::mutate(area_code_work = as.character(area_code)) %>% 
  dplyr::mutate(area_code_cell = as.character(area_code))

### Convert state names to abbreviations
data(state)
area_codes$state_abb <- state.abb[match(area_codes$state,state.name)] #Match with state data
area_codes %>% filter(is.na(state_abb)) #Only DC was not matched
area_codes %<>% 
  mutate(state_abb = ifelse(state=="District of Columbia","DC",state_abb)) %>%
  rename(state_name = state) %>%
  rename(state = state_abb) %>%
  print()
area_codes %>% filter(is.na(state)) #Check all matched


### Test to confirm that no area codes are assoicated with 2+ states
#test <- area_codes %>% dplyr::filter( ( duplicated(area_code,fromLast=F) | 
#                                        duplicated(area_code,fromLast=T) ) ) %>% print()


### -------------------------------------------------- 
### Join TFA area code list with area code dataset
### -------------------------------------------------- 

### Match states by area code for each phone num type
tfa_dat_acmatch <- left_join(x = tfa_dat_wac %>% select(-state), 
                             y = area_codes[,c("state","area_code_home","matches")], 
                             by = "area_code_home") %>%
  dplyr::rename(state_ac_home = state) %>%
  dplyr::rename(matches_ac_home = matches)

tfa_dat_acmatch <- left_join(x = tfa_dat_acmatch, 
                             y = area_codes[,c("state","area_code_work","matches")], 
                             by = "area_code_work") %>%
  dplyr::rename(state_ac_work = state) %>%
  dplyr::rename(matches_ac_work = matches)

tfa_dat_acmatch <- left_join(x = tfa_dat_acmatch, 
                             y = area_codes[,c("state","area_code_cell","matches")], 
                             by = "area_code_cell") %>%
  dplyr::rename(state_ac_cell = state) %>%
  dplyr::rename(matches_ac_cell = matches)


### Create new variable with count of number of phone numbers provided
tfa_dat_acmatch <- tfa_dat_acmatch %>%
  mutate(matches_ac = as.numeric(!is.na(tfa_dat_acmatch[,"matches_ac_home"])) + 
           as.numeric(!is.na(tfa_dat_acmatch[,"matches_ac_work"])) + 
           as.numeric(!is.na(tfa_dat_acmatch[,"matches_ac_cell"])) )

#---
# Findings - Number of Phone Numbers Provided:
# (0) Obs w/ 0 phone number(s) (acmatch_sum[1,2]) = 2,597   ( 2.157%)
# (1) Obs w/ 1 phone number(s) (acmatch_sum[2,2]) = 60,539  (50.274%)
# (2) Obs w/ 2 phone number(s) (acmatch_sum[3,2]) = 57,279  (47.567%)
# (3) Obs w/ 3 phone number(s) (acmatch_sum[4,2]) = 2       ( 0.002%)
#
# Finding: The only two (2) people with 3 numbers are also the only 
# two (2) with work numbers
#
# Findings - Multiple States
# (4) Obs w/ 2 numbers, different states  = 1,912 (  3.32% of (2) above)
# (5) Obs w/ 3 numbers, different states  = 2     (100.00% of (3) above)
# (6) Obs 2/ 2+ numbers, different states = 1,914 (  1.59% of total_obs)
#
#---


### -------------------------------------------------- 
### Finally, Collapse into State Variable (FINAL_state_phone)
### -------------------------------------------------- 

###Create final 
###Note: uses fact that the same 2 people are the only ones with 3 numbers and work numbers
tfa_dat_acmatch_FINAL <- tfa_dat_acmatch %>%
  #First, prioritize matches_ac_cell
  mutate(FINAL_state_phone = state_ac_cell) %>%
  #Second, if no match, use matches_ac_home 
  mutate(FINAL_state_phone = ifelse(is.na(state_ac_cell), 
                                    state_ac_home, 
                                    FINAL_state_phone)) %>%
  #Third, create secondary match
  mutate(FINAL_state_phone_alt = NA) %>%
  mutate(FINAL_state_phone_alt = ifelse(!is.na(FINAL_state_phone)&(matches_ac==2), 
                                        state_ac_home,
                                        FINAL_state_phone_alt)) %>%
  mutate(FINAL_state_phone_alt = ifelse(!is.na(FINAL_state_phone)&(matches_ac==3), 
                                        state_ac_work, 
                                        FINAL_state_phone_alt))


### -------------------------------------------------- 
### ---- Saved Matched Data ---- 
### -------------------------------------------------- 

save(tfa_dat_acmatch_FINAL, 
     file=paste0(file_path,"/Survey Only Match/Temp_Data/tfa_to_state_phone.RData"))


